Load Dataset
# Locate and read the balanced CD campaign dataset.
# The path is resolved against the directory the script was started from, so
# the process working directory is never changed (the former setwd(getwd())
# call was a no-op).
input_file <- "CD_additional_balanced.csv"
work_dir <- getwd()
input_path <- file.path(work_dir, input_file)
if (!file.exists(input_path)) {
  stop("Input file not found: ", input_path, call. = FALSE)
}
# Keep strings as character here; the categorical columns are converted to
# factors explicitly in the next step. FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
cd_data <- read.csv(file = input_path, stringsAsFactors = FALSE)
Inspect and factor dataset
# Copy the raw data frame into a data.table, convert every categorical column
# (including the outcome `y`) to a factor, then print the resulting structure.
cd_dt <- data.table(cd_data)
factor_cols <- c(
  "job", "marital", "education", "default", "housing", "loan",
  "contact", "month", "day_of_week", "poutcome", "y"
)
for (col in factor_cols) {
  # set() replaces the column by reference, one column per iteration.
  set(cd_dt, j = col, value = as.factor(cd_dt[[col]]))
}
str(cd_dt)
## Classes 'data.table' and 'data.frame': 9280 obs. of 21 variables:
## $ age : int 41 49 49 41 45 42 39 28 44 42 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 2 3 10 10 2 2 4 12 8 10 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 1 2 2 2 2 2 2 3 2 2 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 7 3 6 3 3 3 8 4 6 ...
## $ default : Factor w/ 2 levels "no","unknown": 2 2 1 2 2 1 1 2 1 1 ...
## $ housing : Factor w/ 3 levels "no","unknown",..: 3 3 1 3 3 3 3 3 3 1 ...
## $ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 3 1 3 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 4 4 4 ...
## $ duration : int 1575 1042 1467 579 461 673 935 1201 1030 1623 ...
## $ campaign : int 1 1 1 1 1 2 3 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## - attr(*, ".internal.selfref")=<externalptr>
## age job marital education
## Min. :17.0 admin. :2517 divorced:1021 university.degree :3007
## 1st Qu.:31.0 blue-collar:1769 married :5338 high.school :2102
## Median :38.0 technician :1459 single :2900 professional.course:1190
## Mean :40.4 services : 773 unknown : 21 basic.9y :1177
## 3rd Qu.:48.0 management : 651 basic.4y : 895
## Max. :98.0 retired : 595 basic.6y : 458
## (Other) :1516 (Other) : 451
## default housing loan contact month
## no :7824 no :4104 no :7688 cellular :6672 may :2533
## unknown:1456 unknown: 225 unknown: 225 telephone:2608 jul :1477
## yes :4951 yes :1367 aug :1353
## jun :1169
## nov : 886
## apr : 785
## (Other):1077
## day_of_week duration campaign pdays previous
## fri:1763 Min. : 1.0 Min. : 1.000 Min. : 0.0 Min. :0.0000
## mon:1846 1st Qu.: 145.0 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.0000
## thu:2000 Median : 265.0 Median : 2.000 Median :999.0 Median :0.0000
## tue:1810 Mean : 387.4 Mean : 2.333 Mean :887.3 Mean :0.3153
## wed:1861 3rd Qu.: 528.0 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.0000
## Max. :4199.0 Max. :39.000 Max. :999.0 Max. :6.0000
##
## poutcome emp.var.rate cons.price.idx cons.conf.idx
## failure :1074 Min. :-3.4000 Min. :92.20 Min. :-50.80
## nonexistent:7244 1st Qu.:-1.8000 1st Qu.:92.89 1st Qu.:-42.70
## success : 962 Median :-0.1000 Median :93.44 Median :-41.80
## Mean :-0.4963 Mean :93.48 Mean :-40.22
## 3rd Qu.: 1.4000 3rd Qu.:93.99 3rd Qu.:-36.40
## Max. : 1.4000 Max. :94.77 Max. :-26.90
##
## euribor3m nr.employed y
## Min. :0.634 Min. :4964 no :4640
## 1st Qu.:1.244 1st Qu.:5076 yes:4640
## Median :4.021 Median :5191
## Mean :2.960 Mean :5135
## 3rd Qu.:4.959 3rd Qu.:5228
## Max. :5.045 Max. :5228
##
Histograms: Age, Duration, Campaign, Pdays
# For each numeric variable: histogram, boxplot, and decile breakdown.
# The decile probabilities (0%, 10%, ..., 100%) are shared by every
# quantile() call below.
decile_probs <- seq(0, 1, by = 0.1)

# Age
hist(cd_dt[["age"]], main = "Histogram of Age", xlab = "Age")

boxplot(cd_dt[["age"]], main = "Boxplot of Age", ylab = "Age")

quantile(cd_dt[["age"]], probs = decile_probs)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 17 27 30 33 35 38 41 46 51 57 98

# Duration
hist(cd_dt[["duration"]], main = "Histogram of Duration", xlab = "Duration")

boxplot(cd_dt[["duration"]], main = "Boxplot of Duration", ylab = "Duration")

quantile(cd_dt[["duration"]], probs = decile_probs)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 1 80 124 167 211 265 340 452 615 860 4199

# Campaign
hist(cd_dt[["campaign"]], main = "Histogram of Campaign", xlab = "Campaign")

boxplot(cd_dt[["campaign"]], main = "Boxplot of Campaign", ylab = "Campaign")

quantile(cd_dt[["campaign"]], probs = decile_probs)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 1 1 1 1 1 2 2 2 3 4 39

# Pdays
# NOTE(review): from the 20th percentile upward pdays is 999, which looks like
# a placeholder rather than a real day count - confirm against the data
# dictionary before reading much into these plots.
hist(cd_dt[["pdays"]], main = "Histogram of Pdays", xlab = "Pdays")

boxplot(cd_dt[["pdays"]], main = "Boxplot of Pdays", ylab = "Pdays")

quantile(cd_dt[["pdays"]], probs = decile_probs)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 0 11 999 999 999 999 999 999 999 999 999
CD subscription (y), job, education, poutcome
# Frequency table, rounded proportions, and bar chart for each categorical
# variable of interest. Wrapping an assignment in parentheses prints the value
# as well as storing it.

# Job
(job_table <- table(cd_dt[["job"]]))
##
## admin. blue-collar entrepreneur housemaid management
## 2517 1769 308 216 651
## retired self-employed services student technician
## 595 306 773 358 1459
## unemployed unknown
## 248 80
job_prop_table <- prop.table(job_table)
round(job_prop_table, 2)
##
## admin. blue-collar entrepreneur housemaid management
## 0.27 0.19 0.03 0.02 0.07
## retired self-employed services student technician
## 0.06 0.03 0.08 0.04 0.16
## unemployed unknown
## 0.03 0.01
barplot(job_table, main = "Job")

# Education
(edu_table <- table(cd_dt[["education"]]))
##
## basic.4y basic.6y basic.9y high.school
## 895 458 1177 2102
## illiterate professional.course university.degree unknown
## 6 1190 3007 445
edu_prop_table <- prop.table(edu_table)
round(edu_prop_table, 2)
##
## basic.4y basic.6y basic.9y high.school
## 0.10 0.05 0.13 0.23
## illiterate professional.course university.degree unknown
## 0.00 0.13 0.32 0.05
barplot(edu_table, main = "Education Level")

# Outcome of the previous campaign
(poutcome_table <- table(cd_dt[["poutcome"]]))
##
## failure nonexistent success
## 1074 7244 962
poutcome_prop_table <- prop.table(poutcome_table)
round(poutcome_prop_table, 2)
##
## failure nonexistent success
## 0.12 0.78 0.10
barplot(poutcome_table, main = "Previous Outcome")

# CD subscription outcome (y); the two classes are equally represented.
(y_table <- table(cd_dt[["y"]]))
##
## no yes
## 4640 4640
y_prop_table <- prop.table(y_table)
round(y_prop_table, 2)
##
## no yes
## 0.5 0.5
barplot(y_table, main = "CD Subscribed")

Variable Relationships
# Pairwise correlations among the numeric predictors, followed by a per-outcome
# boxplot and summary for each of them.
rel_cols <- c(
  "age", "duration", "campaign", "pdays",
  "euribor3m", "emp.var.rate", "nr.employed"
)
# with = FALSE makes data.table treat rel_cols as a vector of column names.
cor(cd_dt[, rel_cols, with = FALSE])
## age duration campaign pdays euribor3m
## age 1.000000000 -0.02072651 0.003690016 -0.05351616 -0.04462745
## duration -0.020726510 1.00000000 -0.025872465 0.02893622 0.05733951
## campaign 0.003690016 -0.02587247 1.000000000 0.08930062 0.17512283
## pdays -0.053516156 0.02893622 0.089300624 1.00000000 0.38773934
## euribor3m -0.044627449 0.05733951 0.175122827 0.38773934 1.00000000
## emp.var.rate -0.049052629 0.07144035 0.185736186 0.33488799 0.95840218
## nr.employed -0.074686516 0.05823209 0.176972215 0.47499217 0.94054583
## emp.var.rate nr.employed
## age -0.04905263 -0.07468652
## duration 0.07144035 0.05823209
## campaign 0.18573619 0.17697221
## pdays 0.33488799 0.47499217
## euribor3m 0.95840218 0.94054583
## emp.var.rate 1.00000000 0.86752989
## nr.employed 0.86752989 1.00000000
pairs.panels(cd_dt[, rel_cols, with = FALSE])

# Age by outcome
boxplot(age ~ y, data = cd_dt)

aggregate(age ~ y, data = cd_dt, FUN = summary)
## y age.Min. age.1st Qu. age.Median age.Mean age.3rd Qu. age.Max.
## 1 no 17.00000 32.00000 38.00000 39.89375 47.00000 88.00000
## 2 yes 17.00000 31.00000 37.00000 40.91315 50.00000 98.00000

# Duration by outcome
boxplot(duration ~ y, data = cd_dt)

aggregate(duration ~ y, data = cd_dt, FUN = summary)
## y duration.Min. duration.1st Qu. duration.Median duration.Mean
## 1 no 1.0000 94.0000 166.0000 221.5323
## 2 yes 37.0000 253.0000 449.0000 553.1912
## duration.3rd Qu. duration.Max.
## 1 279.2500 1994.0000
## 2 741.2500 4199.0000

# Campaign by outcome
boxplot(campaign ~ y, data = cd_dt)

aggregate(campaign ~ y, data = cd_dt, FUN = summary)
## y campaign.Min. campaign.1st Qu. campaign.Median campaign.Mean
## 1 no 1.000000 1.000000 2.000000 2.614871
## 2 yes 1.000000 1.000000 2.000000 2.051724
## campaign.3rd Qu. campaign.Max.
## 1 3.000000 39.000000
## 2 2.000000 23.000000

# Pdays by outcome
# NOTE(review): pdays is 999 at the quartiles and median of both groups, which
# looks like a placeholder value - confirm its meaning before interpreting the
# group means or the pdays correlations above.
boxplot(pdays ~ y, data = cd_dt)

aggregate(pdays ~ y, data = cd_dt, FUN = summary)
## y pdays.Min. pdays.1st Qu. pdays.Median pdays.Mean pdays.3rd Qu. pdays.Max.
## 1 no 0.0000 999.0000 999.0000 982.5293 999.0000 999.0000
## 2 yes 0.0000 999.0000 999.0000 792.0356 999.0000 999.0000

# Euribor3m by outcome
boxplot(euribor3m ~ y, data = cd_dt)

aggregate(euribor3m ~ y, data = cd_dt, FUN = summary)
## y euribor3m.Min. euribor3m.1st Qu. euribor3m.Median euribor3m.Mean
## 1 no 0.635000 1.405000 4.857000 3.797283
## 2 yes 0.634000 0.849000 1.266000 2.123135
## euribor3m.3rd Qu. euribor3m.Max.
## 1 4.962000 4.970000
## 2 4.406000 5.045000

# Emp.var.rate by outcome
boxplot(emp.var.rate ~ y, data = cd_dt)

aggregate(emp.var.rate ~ y, data = cd_dt, FUN = summary)
## y emp.var.rate.Min. emp.var.rate.1st Qu. emp.var.rate.Median
## 1 no -3.4000000 -1.8000000 1.1000000
## 2 yes -3.4000000 -1.8000000 -1.8000000
## emp.var.rate.Mean emp.var.rate.3rd Qu. emp.var.rate.Max.
## 1 0.2409052 1.4000000 1.4000000
## 2 -1.2334483 -0.1000000 1.4000000

# Nr.employed by outcome
boxplot(nr.employed ~ y, data = cd_dt)

aggregate(nr.employed ~ y, data = cd_dt, FUN = summary)
## y nr.employed.Min. nr.employed.1st Qu. nr.employed.Median nr.employed.Mean
## 1 no 4963.600 5099.100 5195.800 5175.497
## 2 yes 4963.600 5017.500 5099.100 5095.116
## nr.employed.3rd Qu. nr.employed.Max.
## 1 5228.100 5228.100
## 2 5191.000 5228.100
Scatterplots
# 3D scatterplots of call duration against two pairs of predictors.
# The plotting symbol encodes the outcome `y` (factor codes 1, 2, ...).
# With highlight.3d = TRUE the point colour is a depth cue chosen by
# scatterplot3d, not a class colour, so the legends list symbols only; a
# legend with col = 1:2 would advertise a colour coding the plot does not use.
# Explicit axis labels replace the default deparsed expressions
# (e.g. "cd_dt$age").
outcome_levels <- levels(cd_dt$y)
outcome_pch <- as.numeric(cd_dt$y)

scatterplot3d(
  cd_dt$age, cd_dt$campaign, cd_dt$duration,
  highlight.3d = TRUE, pch = outcome_pch,
  xlab = "Age", ylab = "Campaign", zlab = "Duration",
  main = "3D Scatterplot of CD data"
)
legend(
  "topright",
  legend = outcome_levels, pch = seq_along(outcome_levels), cex = 0.8
)

scatterplot3d(
  cd_dt$nr.employed, cd_dt$euribor3m, cd_dt$duration,
  highlight.3d = TRUE, pch = outcome_pch,
  xlab = "Nr.employed", ylab = "Euribor3m", zlab = "Duration",
  main = "3D Scatterplot of CD data"
)
legend(
  "topright",
  legend = outcome_levels, pch = seq_along(outcome_levels), cex = 0.8
)
